################################################################################ 
#
# Replication code for:
# Empathy-based counterspeech can reduce racist hate speech
# in a social media field experiment
#
# PNAS 2021
# 
# 
# Additional Unpublished Analysis
# --------------------------------
# 
# In this script, we do additional analysis on the effect of the empathy treatment
# on the amount of total tweets. We show that the effect is robust and mostly driven
# by heavy users
# 
################################################################################ 

rm(list = ls())
setsave = F

################################################################################ 
#  LIBRARIES
################################################################################ 

library(stargazer)
library(dplyr)
library(tidyr)
library(readr)
library(estimatr)
library(xtable)
library(quantreg)


################################################################################ 
#   DATA AND FOLDER
################################################################################ 

wd = dirname(dirname(dirname(rstudioapi::getActiveDocumentContext()$path)))
wd_res = paste0(wd, '/results')
wd_data =  paste0(wd, '/data')


# Upload the data
setwd(wd_data)
data = read.csv('main_dataset.csv') %>% as.data.frame()
data = data[(is.na(data$deleted_suspended)), ]


# Impute missing as averages
for(i in 1:ncol(data)){
  data[is.na(data[,i]), i] <- mean(data[,i], na.rm = TRUE)
}


################################################################################ 
#   VARIABLES
################################################################################

data$empathy_dummy = ifelse(data$treatment_main==3, 1, 0)
data$consequences_dummy = ifelse(data$treatment_main==2, 1, 0)
data$meme_dummy = ifelse(data$treatment_main==1, 1, 0)

# keep only emphaty
df = data[data$treatment_main %in% c(0,3),]


################################################################################ 
#  OLS REGRESSIONS without controls
################################################################################ 


container = list()

container[[1]] = lm_robust(num_tweets_post ~ treatment_dummy, data=df)
container[[2]] = lm_robust(num_tweets_post ~ treatment_dummy + num_tweets_d1m, data=df)
container[[3]] = lm_robust(num_tweets_post ~ treatment_dummy + num_tweets_d1m + num_tweets_w1m, data=df)
container[[4]] = lm_robust(num_tweets_post ~ treatment_dummy + num_tweets_d1m + num_tweets_w1m + num_tweets_w2m, data=df)
container[[5]] = lm_robust(num_tweets_post ~ treatment_dummy + num_tweets_d1m + num_tweets_w1m + num_tweets_w2m + num_tweets_w3m, data=df)
container[[6]] = lm_robust(num_tweets_post ~ treatment_dummy + num_tweets_d1m + num_tweets_w1m + num_tweets_w2m + num_tweets_w3m + num_tweets_w4m, data=df)

sapply(container, function(x) c(x$coefficients[2], x$std.error[2], x$p.value[2], x$nobs, x$r.squared)) %>% round(3) %>% write.csv(paste0(wd_res, "/reg_rr.csv"))


################################################################################ 
#   Quantile regression
################################################################################ 

# with rank
formula = as.formula(num_tweets_post ~ treatment_dummy)
numb = seq(0, 1, .2)

container = list()
count =0
for (n in numb){
  print(paste0('--- Quintile ', n, '----'))
  count = count + 1
  container[[count]] = rq(formula, data=df, tau=n) %>% summary.rq()
  }
sapply(container, function(x) round(c(x$coefficients[2], x$coefficients[4], x$coefficients[6]), 3)) %>% write.csv(paste0(wd_res, "/reg_quant1.csv"))


# bootstrapping
container = list()
count =0
for (n in numb){
  print(paste0('--- Quintile ', n, '----'))
  count = count + 1
  container[[count]] = rq(formula, data=df, tau=n) %>% summary.rq(se='boot')
}
sapply(container, function(x) round(c(x$coefficients[2], x$coefficients[4], x$coefficients[8]), 3))  %>% write.csv(paste0(wd_res, "/reg_quant2.csv"))


################################################################################ 
# Histograms
################################################################################ 

hist(data$num_tweets_post[data$treatment_main==0 & data$num_tweets_post<3000],
     breaks=50, col=rgb(0,0,1,0.5), ylim=c(0,0.0058), freq=F,
     main='Density of Total Tweets in\nControl (blue) and Empathy (red) groups',
     xlab='Number of Tweets after the intervention')
hist(data$num_tweets_post[data$treatment_main==3 & data$num_tweets_post<3000], breaks=50, col=rgb(1,0,0,0.5), add=T, freq=F)
